package cn.tonyandmoney.lib.webmagic.biz

import us.codecraft.webmagic.Page
import us.codecraft.webmagic.Site
import us.codecraft.webmagic.processor.PageProcessor

/**
 * Created by niantuo on 2019/2/14.
 *
 * Demo [PageProcessor] that crawls repositories on GitHub.
 *
 * For every processed page it queues the repository-style links found in the HTML and stores the
 * `author`, `name` and `readme` result fields. Pages on which no repository name can be extracted
 * are flagged with `setSkip(true)`.
 */
class GithubRepoPageProcessor : PageProcessor {

    // Configured once (retry times = 3, sleep time = 1000) and returned by every getSite() call,
    // so adjustments made through the returned object are not lost on the next call.
    private val siteConfig: Site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(1000)

    override fun getSite(): Site = siteConfig

    /**
     * Queues follow-up repository links and extracts the result fields from [page].
     *
     * @param page the downloaded page handed in by the crawler.
     */
    override fun process(page: Page) {
        page.addTargetRequests(page.html.links().regex(REPO_LINK_REGEX).all())
        page.putField("author", page.url.regex(AUTHOR_REGEX).toString())

        // Keep the extracted name in a local instead of reading it back out of resultItems.
        val name: String? = page.html.xpath(NAME_XPATH).toString()
        page.putField("name", name)

        if (name.isNullOrBlank()) {
            // No repository name on this page: flag it as skipped and stop here, so the readme
            // XPath is not evaluated for a page whose results are being discarded.
            page.setSkip(true)
            return
        }

        // NOTE(review): the selector result itself is stored here (not its toString()), unlike
        // "author" and "name"; kept as-is because downstream pipelines may depend on it.
        page.putField("readme", page.html.xpath(README_XPATH))
    }

    private companion object {
        // Owner segment accepts word characters and '-'; repository segment additionally accepts '.'.
        // A plain \w+ would cut a link such as ".../code4craft/webmagic-core" short at the hyphen.
        const val REPO_LINK_REGEX = "(https://github\\.com/[\\w-]+/[\\w.-]+)"

        // Captures the owner segment of the page URL, including hyphenated owner names.
        const val AUTHOR_REGEX = "https://github\\.com/([\\w-]+)/.*"

        const val NAME_XPATH = "//h1[@class='public']/strong/a/text()"
        const val README_XPATH = "//div[@id='readme']/tidyText()"
    }

}